In [10]:
# %pylab provides the bare numpy/pyplot names (loadtxt, hist, plot, ...) used throughout
%pylab inline
import matplotlib.pyplot as plt
import numpy as np
# Some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()
The problem with weighting the interactions in the curated edgelist we are using to build our protein-protein interaction network is that the classifier is not perfect: it assigns extremely low probabilities to some interactions we expect to exist. We can encode this prior into our model by treating database inclusion, edgelist inclusion and the classifier result as events that all depend on an underlying interaction event. If we further assume these events are conditionally independent given the interaction, we have a Naive Bayes model.
Specifically, with $I$ the interaction event, $c$ the classifier result and $e$ the edgelist inclusion result, the joint distribution can be written:

$$ p(I,e,c) = p(I)\,p(e|I)\,p(c|I) $$

where the individual distributions are:

$$ p(I=1) = \mathrm{Bernoulli}(I=1;\theta_{I}) = \theta_{I} $$
$$ p(e|I=1) = \mathrm{Bernoulli}(e;\theta_{e}) = \theta_{e}^{e}(1-\theta_{e})^{1-e} $$
$$ p(c|I=1) = \mathrm{Beta}(c;\alpha,\beta) = \frac{1}{B(\alpha,\beta)}\,c^{\alpha-1}(1-c)^{\beta-1} $$

To use this model we need to find the parameters $\alpha$, $\beta$, $\theta_{I}$ and $\theta_{e}$. Two of these, $\alpha$ and $\beta$, can be estimated directly by fitting Beta distributions to the classifier's output on labelled examples.
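As a quick sanity check, here is a minimal sketch of evaluating this joint density at a point. The parameter values and the helper `joint_I1` are made up for illustration; the fitted values are derived below.

In [ ]:
import scipy.stats
# Hypothetical parameters, for illustration only (not the fitted values below)
theta_I_demo, theta_e_demo = 1.0/600, 0.9   # assumed prior and edgelist rate
alpha_demo, beta_demo = 2.0, 1.0            # assumed Beta parameters for p(c|I=1)

def joint_I1(e, c):
    """Evaluate p(I=1, e, c) = p(I=1) p(e|I=1) p(c|I=1)."""
    p_e = theta_e_demo**e * (1 - theta_e_demo)**(1 - e)      # Bernoulli term
    p_c = scipy.stats.beta.pdf(c, alpha_demo, beta_demo)     # Beta term
    return theta_I_demo * p_e * p_c

print(joint_I1(1, 0.8))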
First we have to load the data:
In [1]:
import scipy.stats
In [3]:
cd ../../forGAVIN/mergecode/OUT/
In [4]:
interactions = loadtxt("edgelist_weighted.txt",dtype=str)
In [5]:
interactions = interactions[1:]
In [6]:
weights = interactions[:,2]
weights = weights.astype(float)
Now we load the classifier's sampled output probabilities for the positive and negative examples and inspect their distributions:
In [7]:
import pickle
In [8]:
# unpickle the classifier's output samples (keyed by class: 0 and 1)
f = open("../../../features/random.forest.bayes.dist.samples.pickle", "rb")
samples = pickle.load(f)
f.close()
In [12]:
h = hist(samples[1][:,1],bins=50)
In [13]:
h = hist(samples[0][:,1],bins=100)
In [82]:
# regularisation samples: a few uniform Beta(1,1) draws to stabilise the fits
posregsamp = scipy.stats.beta.rvs(1,1,size=10)
negregsamp = scipy.stats.beta.rvs(1,1,size=10)
In [90]:
idx1 = random_integers(0,len(samples[1][:,1])-1,size=1000)
idx0 = random_integers(0,len(samples[0][:,1])-1,size=1000)
In [91]:
alpha_1,beta_1,_,_ = scipy.stats.beta.fit(hstack([samples[1][:,1][idx1],posregsamp]))
In [92]:
alpha_0,beta_0,_,_ = scipy.stats.beta.fit(hstack([samples[0][:,1][idx0],negregsamp]))
In [93]:
x = linspace(0,1,500)
In [94]:
b1 = scipy.stats.beta.pdf(x,alpha_1,beta_1)
b0 = scipy.stats.beta.pdf(x,alpha_0,beta_0)
We use logarithmic scaling on the y-axis in the following plot to yield a readable graph:
In [95]:
plot(x,b1,label="interaction")
plot(x,b0,label="non-interaction")
yscale("log")
legend(loc=0)
In [76]:
import os
In [77]:
plotpath = os.path.abspath("../../../plots/bayes/")
In [78]:
savez(os.path.join(plotpath,"rf.beta.npz"),x,b1,b0)
In [26]:
import pickle
In [27]:
import sys
sys.path.append("../../../opencast-bio/")
In [28]:
import ocbio.extract
In [29]:
f = open("../../../string/human.Entrez.string.pickle")
stringdb = pickle.load(f)
f.close()
In [30]:
f = open("../../../iRefIndex/human.iRefIndex.Entrez.1ofk.pickle")
irefdb = pickle.load(f)
f.close()
In [31]:
stringlabelled = {}
for k in stringdb.featuredict:
    # label each STRING score (scaled to [0,1]) with its iRefIndex class
    try:
        stringlabelled[max(irefdb[k])] += [float(stringdb[k][-1])/1000]
    except KeyError:
        stringlabelled[max(irefdb[k])] = [float(stringdb[k][-1])/1000]
In [32]:
h=hist(stringlabelled['1'],bins=50,label='interaction')
h=hist(stringlabelled['0'],bins=50,label='non-interaction')
l=legend()
In [85]:
# fresh regularisation samples for the STRING fits
posregsamp = scipy.stats.beta.rvs(1,1,size=10)
negregsamp = scipy.stats.beta.rvs(1,1,size=10)
In [58]:
stringlabelled['1'] = array(stringlabelled['1'])
stringlabelled['0'] = array(stringlabelled['0'])
In [46]:
N1 = len(stringlabelled['1'])
N2 = len(stringlabelled['0'])
In [86]:
# random_integers is inclusive of the upper bound, so subtract 1 to stay in range
s1idx = random_integers(0,N1-1,size=1000)
s2idx = random_integers(0,N2-1,size=1000)
In [87]:
stralpha_1,strbeta_1,_,_ = scipy.stats.beta.fit(hstack([stringlabelled['1'][s1idx],posregsamp]))
stralpha_0,strbeta_0,_,_ = scipy.stats.beta.fit(hstack([stringlabelled['0'][s2idx],negregsamp]))
In [111]:
b1 = scipy.stats.beta.pdf(x,stralpha_1,strbeta_1)
b0 = scipy.stats.beta.pdf(x,stralpha_0,strbeta_0)
In [112]:
plot(x,b1,label="interaction")
plot(x,b0,label="non-interaction")
l=legend(loc=0)
In [113]:
savez(os.path.join(plotpath,"string.beta.npz"),x,b1,b0)
Using these distributions we can find the posterior probabilities for each of the interactions in this edgelist:
$$ p(I=1|e,c;\theta_{e},\alpha,\beta) = \frac{p(I=1,e,c;\theta_{e},\alpha,\beta)}{p(e,c;\theta_{e},\alpha,\beta)} = \frac{p(I=1)\,p(e|I=1;\theta_{e})\,p(c|I=1;\alpha,\beta)}{p(e,c;\theta_{e},\alpha,\beta)} $$

where we find the denominator by marginalising the joint:

$$ p(e,c;\theta_{e},\alpha,\beta) = \sum_{I} p(I,e,c;\theta_{e},\alpha,\beta) $$

which requires the corresponding distributions for $I=0$:

$$ p(I=0) = \mathrm{Bernoulli}(I=0;\theta_{I}) = 1-\theta_{I} $$
$$ p(e|I=0) = \mathrm{Bernoulli}(e;\theta_{e0}) = \theta_{e0}^{e}(1-\theta_{e0})^{1-e} $$
$$ p(c|I=0) = \mathrm{Beta}(c;\alpha_{0},\beta_{0}) = \frac{1}{B(\alpha_{0},\beta_{0})}\,c^{\alpha_{0}-1}(1-c)^{\beta_{0}-1} $$

In this case $e$ is always one (every example is an element of the edgelist), and we choose to set $\theta_{e0} = 1-\theta_{e}$. Therefore, we can solve for the posterior at arbitrary points:
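Since $e=1$ for every pair we score, the posterior reduces to a two-term ratio; the posterior function below computes exactly this in log space (with the likelihood generalised to a product over all the class-conditional evidence terms):

$$ p(I=1|e=1,c) = \frac{\theta_{I}\,\theta_{e}\,p(c|I=1;\alpha,\beta)}{\theta_{I}\,\theta_{e}\,p(c|I=1;\alpha,\beta) + (1-\theta_{I})\,\theta_{e0}\,p(c|I=0;\alpha_{0},\beta_{0})} $$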
In [38]:
def posterior(prior, logprobdict):
    """Takes a dictionary of class-conditional log probabilities and a
    prior probability p(I=1); produces the corresponding posterior
    probabilities p(I=1|evidence)."""
    # dictionary should have keys 0 and 1
    post = []
    for l1, l0 in zip(logprobdict[1], logprobdict[0]):
        # log numerator: log prior plus the class-1 log likelihood
        numerator = l1 + log(prior)
        # log denominator: marginalise over I, weighting class 0 by 1 - prior
        denominator = logaddexp(numerator, l0 + log(1.0 - prior))
        post.append(exp(numerator - denominator))
    return post
In [39]:
alpha = {0:alpha_0,1:alpha_1}
beta = {0:beta_0,1:beta_1}
In [40]:
stralpha = {0:stralpha_0,1:stralpha_1}
strbeta = {0:strbeta_0,1:strbeta_1}
In the cell below we explicitly define various probabilities for the different data sources. The values used are estimates, aiming to be conservative. For each database two different probabilities must be defined: the true positive rate (TPR), the probability that the database reports an interaction given that one exists, and the true negative rate (TNR), the probability that it reports no interaction given that none exists. Only these two need to be set because the rates are related within each class:

$$ FNR = 1 - TPR $$
$$ FPR = 1 - TNR $$

We define these probabilities for iRefIndex and for edgelist inclusion before we begin.
In [103]:
# two free probabilities per database; the other rates follow by normalisation
irefprob = {'tpr':0.9,'tnr':0.5}
irefprob['fnr'] = 1.0 - irefprob['tpr']
irefprob['fpr'] = 1.0 - irefprob['tnr']
# for the edgelist e is always 1, so only theta_e (TPR) and
# theta_e0 = 1 - theta_e (FPR) are ever used
edgelistprob = {'tpr':0.9,'fpr':0.1}
In [104]:
# build logprobdict from the classifier, STRING and iRefIndex evidence
logprobdict = {}
for cls in [0,1]:
    for l in interactions:
        pair = frozenset(l[:2])
        # squash weights away from 0 and 1 to avoid infinite log-densities
        # at the boundary of the Beta support
        weight = float(l[2])*0.9 + 0.01
        pclass = scipy.stats.beta.logpdf(weight,alpha[cls],beta[cls])
        wstring = (float(stringdb[pair][-1])/1000)*0.9 + 0.09999
        pstring = scipy.stats.beta.logpdf(wstring,stralpha[cls],strbeta[cls])
        irefresponse = max(map(float,irefdb[pair]))
        if cls == 1:
            piref = log(irefresponse*irefprob['tpr'] + (1.0-irefresponse)*irefprob['fnr'])
            pedgelist = log(edgelistprob['tpr'])
        else:
            piref = log(irefresponse*irefprob['fpr'] + (1.0-irefresponse)*irefprob['tnr'])
            pedgelist = log(edgelistprob['fpr'])
        try:
            logprobdict[cls] += [sum([pclass,pstring,pedgelist,piref])]
        except KeyError:
            logprobdict[cls] = [sum([pclass,pstring,pedgelist,piref])]
In [105]:
# prior p(I=1) = 1/600: assume roughly one in six hundred pairs interact
postweightings = posterior(1.0/600,logprobdict)
In [106]:
h=hist(postweightings,bins=100)
This is quite a discrete set of weights, probably due to all the binary database-inclusion features we're using.
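As a quick check on that claim, we can count the distinct posterior values (this snippet is illustrative; rounding to 6 decimal places is an arbitrary choice to absorb floating-point noise):

In [ ]:
# count distinct posterior values after rounding away float noise
print(len(unique(around(postweightings,6))))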
In [122]:
!git annex unlock ../../../plots/bayes/postweightings.npz
In [123]:
savez(os.path.join(plotpath,"postweightings.npz"),postweightings)
In [107]:
import csv
In [109]:
!git annex unlock edgelist_update_weighted.txt
In [110]:
f = open("edgelist_update_weighted.txt","w")
c = csv.writer(f, delimiter="\t")
for i,p in enumerate(interactions[:,:2]):
c.writerow(list(p)+[postweightings[i]])
f.close()